In [1]:
import os
import time
import math
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
from pandarallel import pandarallel
import requests
import sys

import nltk
from textblob import TextBlob
from wordcloud import WordCloud
from google.cloud import storage
from textblob.sentiments import NaiveBayesAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import spacy
from collections import Counter
import concurrent.futures

import warnings

warnings.simplefilter('once')
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
In [2]:
# Notebook-wide pandas display settings: show up to 100 rows, never truncate
# the column list, and allow up to 500 characters per cell.
for option_name, option_value in {
    'display.max_rows': 100,
    'display.max_columns': None,
    'display.max_colwidth': 500,
}.items():
    pd.set_option(option_name, option_value)
In [3]:
# Number of logical CPUs reported for this machine.
num_processors = multiprocessing.cpu_count()

# Leave one core free, but never drop below a single worker: on a 1-CPU
# machine `num_processors - 1` would be 0, which is not a usable worker count.
workers = max(1, num_processors - 1)

print(f'Using {workers} workers')
Using 15 workers
In [4]:
pandarallel.initialize(nb_workers=workers, use_memory_fs=False, progress_bar=True)
INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

1. Import Data¶

In [5]:
%%time

file_path = 'news_cleaned.parquet'
news = pd.read_parquet(file_path)
CPU times: user 19.8 s, sys: 29.5 s, total: 49.2 s
Wall time: 36.9 s
In [6]:
news.shape # (198064, 16)
Out[6]:
(198064, 16)
In [7]:
news.columns
Out[7]:
Index(['url', 'date', 'language', 'title', 'text', 'year', 'month', 'day',
       'text_ner', 'text_cleaned', 'text_lemm', 'title_ner', 'title_cleaned',
       'title_lemm', 'title_word_count', 'text_word_count'],
      dtype='object')
In [8]:
news.sample(1, random_state = 42)[['text_ner', 'text_cleaned', 'text_lemm', 'title_ner', 'title_cleaned', 'title_lemm']]
Out[8]:
text_ner text_cleaned text_lemm title_ner title_cleaned title_lemm
196666 Prosecutors in all states urge Congress to strengthen tools to fight AI child sexual abuse images Skip to contentCommunity Coverage TourHome ProMedically SpeakingBest of the WestChampions in AgBack to Our AppsCOVID 19Food for NewsTexasNew to a TipLatest CamsClosings and DelaysSend Us Your Weather PhotosTxDOT Highway ConditionsDownload the Weather AppWeather ResourcesKCBD InvestigatesSubmit a TipChad Read ShootingReagor Dykes CoverageSex Trafficking on the South PlainsLubbock County Medical E... prosecutors states urge congress strengthen tools fight ai child sexual abuse images skip contentcommunity coverage tourhome promedically speakingbest westchampions agback appscovid newstexasnew tiplatest camsclosings delayssend us weather photostxdot highway conditionsdownload weather appweather resourceskcbd investigatessubmit tipchad read shootingreagor dykes coveragesex trafficking south plainslubbock county medical examiner school beat petestats predictionshow watchcommunitytell somethi... prosecutor state urge congress strengthen tool fight ai child sexual abuse image skip contentcommunity coverage tourhome promedically speakingbest westchampions agback appscovid newstexasnew tiplatest camsclosings delayssend u weather photostxdot highway conditionsdownload weather appweather resourceskcbd investigatessubmit tipchad read shootingreagor dyke coveragesex traffic south plainslubbock county medical examiner school beat petestats predictionshow watchcommunitytell something goodnot... Prosecutors in all states urge Congress to strengthen tools to fight AI child sexual abuse images prosecutors states urge congress strengthen tools fight ai child sexual abuse images prosecutor state urge congress strengthen tool fight ai child sexual abuse image

2. Sentiment Analysis with TextBlob: Polarity and Subjectivity¶

textblob.sentiments module contains two sentiment analysis implementations

  • PatternAnalyzer (based on the pattern library: https://www.clips.uantwerpen.be/pattern)
  • NaiveBayesAnalyzer (an NLTK classifier trained on a movie reviews corpus).

The default implementation is PatternAnalyzer, but you can override the analyzer to use NaiveBayesAnalyzer

Polarity and Subjectivity:

  • Polarity is a float in the range [-1, 1], where 1 indicates a positive statement and -1 a negative statement.
  • Subjective sentences generally express personal opinion, emotion, or judgment, whereas objective sentences convey factual information. Subjectivity is also a float, in the range [0, 1], where 0 is fully objective and 1 is fully subjective.
In [9]:
# Function to analyze sentiment and categorize based on polarity
def analyze_and_categorize_sentiment(text, neutral_threshold=0.0):
    """Score a document with TextBlob and bucket it by polarity.

    Parameters
    ----------
    text : str
        Document to score. Missing values (``None``, ``NaN`` or any other
        non-string) are not passed to TextBlob.
    neutral_threshold : float, default 0.0
        Polarities whose absolute value is at or below this are labelled
        ``'neutral'``. The default keeps the strict sign-based split; a small
        positive value (e.g. ``1e-9``) stops floating-point residue such as
        ``2e-18`` from being counted as a real positive or negative score.

    Returns
    -------
    tuple
        ``(sentiment_label, polarity, subjectivity)`` where the label is
        ``'positive'``, ``'negative'`` or ``'neutral'``. For a non-string
        input the result is ``(None, nan, nan)`` so the row shows up as
        missing instead of being averaged in as a genuine neutral score.
    """
    if not isinstance(text, str):
        # TextBlob rejects non-string input; propagate "missing" rather than
        # aborting the whole parallel apply on a single bad row.
        return None, float('nan'), float('nan')

    sentiment = TextBlob(text).sentiment
    polarity = sentiment.polarity
    subjectivity = sentiment.subjectivity

    if polarity > neutral_threshold:
        sentiment_label = 'positive'
    elif polarity < -neutral_threshold:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'
    return sentiment_label, polarity, subjectivity
In [10]:
%%time

# Apply the function in parallel
results = news['text_cleaned'].parallel_apply(analyze_and_categorize_sentiment)
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13205), Label(value='0 / 13205')))…
CPU times: user 9.08 s, sys: 5.81 s, total: 14.9 s
Wall time: 1min 48s
In [11]:
# Unpack the (label, polarity, subjectivity) tuples into three columns.
# Reuse the index of `results` so an index-based join lines each score up with
# the row it was computed from, even when that index is not a default 0..n-1.
df_sentiments_textblob = pd.DataFrame(
    results.tolist(),
    index=results.index,
    columns=['tblob_sent', 'tblob_score', 'tblob_sub'],
)
In [12]:
# Attach the TextBlob columns by index. Sentiment columns left over from a
# previous run of this cell are dropped first, so re-running it replaces them
# instead of failing on overlapping column names.
news = news.drop(columns=df_sentiments_textblob.columns, errors='ignore').join(df_sentiments_textblob)
In [13]:
news[news['tblob_sent'] == 'positive'][['text_ner', 'tblob_sent', 'tblob_score', 'tblob_sub']].sample(3, random_state = 42)
Out[13]:
text_ner tblob_sent tblob_score tblob_sub
4571 Europe s bid for AI standard faces long road, EU lawmakers say NewsBreakSign ArtTV SeriesBooks DanceBehind Viral VideosPerforming ArtsTV MusicHip. HealthHealth ServicesMental HealthDiseases s HealthCancerFood SportsPremier DrinksPetsBeauty SafetyPublic SafetyAccidentsLaw EnforcementTraffic AdviceFamily RentLabor IssuesTrouble ScienceEarth NationsMiddle locations, channels, topics, people ... inReuters Follow321K Followers285K Post146M ViewsABOUTReuters provides award winning coverage of the ... positive 0.053655 0.285548
111661 ChatGPT app for smartphones now available in India Know how to download DH Latest News, DH NEWS, Latest News, NEWS, Technology, Apple iPhone X, iPhone users, ChatGPT, chatbot ChatGPT, creators ChatGPT, Aplle iPhone Sunday, May Breaking ChatGPT app for smartphones now available in India Know how to download Public sector bank hikes interest rates on fixed deposits The house where Nazi dictator Hitler was born to be converted into a human rights training centre Here is the timing, schedule and... positive 0.193479 0.355669
124068 MOMENTUM GLOBAL INVESTMENT MANAGEMENT ANNOUNCES STRATEGIC PARTNERSHIP WITH MDOTM LTD TO DEVELOP ARTIFICIAL INTELLIGENCE AI CAPABILITIES AND INSIGHTS DRIVEN INVESTMENT SOLUTIONS Skip to contentNewsElection ClipsLive and CamsClosings DelaysFish Game ForecastFirst Alert Weather ClassesSportsSports ConnectionFootball Friday TopsPrepSpinSign Up for eNewsJob WatchContestsVideo ClipsLive StreamLatest ScheduleContact UsMeet the News TeamAdvertise with UsSubmit a StorySubmit Photo or VideoSubmit Birt... positive 0.049083 0.403997
In [14]:
news[news['tblob_sent'] == 'negative'][['text_ner', 'tblob_sent', 'tblob_score', 'tblob_sub']].sample(3, random_state = 42)
Out[14]:
text_ner tblob_sent tblob_score tblob_sub
110487 Snag a Lifetime Subscription to Jott Pro AI Text Speech Toolkit for a Massive OffThe InventoryThe A.V. RootThe TakeoutThe OnionIt s all of the DayBest Amazon DealsKinja GoodsWe may earn a commission from links on this page.It s all of the DayBest Amazon DealsKinja GoodsTechSnag a Lifetime Subscription to Jott Pro AI Text Speech Toolkit for a Massive OffState of the art, AI driven tech will streamline your workflow, make you more efficient and reduce human error.ByWilliam HelmsPublished6 minu... negative -0.076507 0.661994
158858 Artificial Intelligence in Retail Market is Enhanced by Inception of Exponential Technologies such as Sensors, Robotics, Virtual Reality Jewish Market Reports Skip to content Saturday, April, Contact Jewish Market Reports Jewish Market Research News Market Sales Industry Analysis Market Size Market Report Market Outlook Industry Growth Contact You are hereHomeNews2020 Artificial Intelligence in Retail Market is Enhanced by Inception of Exponential Technologies such as Sensors, Robotics, Virt... negative -0.075549 0.538839
191646 Artificial Intelligence Market Promising Growth Opportunities over TechNews.mobi Market Reports Skip to content TechNews.mobi Market Reports Reporting about the Technology Market Space Electric News NASA Satellite Climate Market Forecast Industry Analysis Market Reports Contact Us Space Electric News NASA Satellite Climate Market Forecast Industry Analysis Market Reports Contact Us Artificial Intelligence Market Promising Growth Opportunities over By email protected Published February, All N... negative -0.030757 0.475267
In [15]:
news.isnull().sum()
Out[15]:
url                 0
date                0
language            0
title               0
text                0
year                0
month               0
day                 0
text_ner            0
text_cleaned        0
text_lemm           0
title_ner           0
title_cleaned       0
title_lemm          0
title_word_count    0
text_word_count     0
tblob_sent          0
tblob_score         0
tblob_sub           0
dtype: int64
In [16]:
news.to_parquet('news_tblob_sent.parquet')
In [17]:
# Upload the locally saved parquet file to a Google Cloud Storage bucket.
# Google Cloud Storage details
bucket_name = 'nlp-final'
# NOTE(review): `file_path` also held the input parquet path when the data was
# loaded; it is rebound here to the destination object name.
file_path = 'news_tblob_sent.parquet'  # This is the name the file will have in GCS
local_file_path = 'news_tblob_sent.parquet'  # Path to the local file you just saved

# Create a GCS Client
# (presumably picks up credentials from the environment - TODO confirm)
storage_client = storage.Client()

# Get the bucket
bucket = storage_client.get_bucket(bucket_name)

# Create a blob object from the filepath
blob = bucket.blob(file_path)

# Upload the file
blob.upload_from_filename(local_file_path)

3-(A). Sentiment over time: Polarity Score¶

3.1. Overall Sentiment (Average of Sentiment from Positive and Negative)¶

1. Sentiment Distribution¶

In [18]:
# Tally articles per sentiment label (most frequent first) as a two-column frame.
sentiment_counts = (
    news['tblob_sent']
    .value_counts(ascending=False)
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)
sentiment_counts
Out[18]:
Sentiment Count
0 positive 183719
1 negative 13710
2 neutral 635
In [19]:
# Bar chart of the label counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x='Sentiment', y='Count', data=sentiment_counts, ax=ax)

# Title and axis labels.
ax.set_title('Sentiment Distribution from tblob Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')

plt.show()
No description has been provided for this image
In [20]:
# Magnitude of the polarity, ignoring its sign.
abs_tblob_score = news['tblob_score'].abs()
In [21]:
abs_tblob_score.describe()
Out[21]:
count    198064.000000
mean          0.112337
std           0.068655
min           0.000000
25%           0.065693
50%           0.105343
75%           0.148454
max           1.000000
Name: tblob_score, dtype: float64
In [22]:
# Density-normalised histogram with a KDE overlay of |polarity|.
# `sns.histplot` replaces the deprecated `sns.distplot`; `stat='density'` keeps
# the y-axis on the density scale that the axis label promises.
plt.figure(figsize=(7, 5))  # Set the size of the plot
sns.histplot(abs_tblob_score, bins=30, kde=True, stat='density', edgecolor='black')

# Customize the plot
plt.title('Distribution of Absolute tblob Polarity Scores')
plt.xlabel('Absolute Polarity Score')  # TextBlob reports polarity, not a compound score
plt.ylabel('Density')

plt.show()
No description has been provided for this image
In [23]:
tblob_score = news['tblob_score']
In [24]:
tblob_score.describe()
Out[24]:
count    198064.000000
mean          0.104398
std           0.080214
min          -1.000000
25%           0.061382
50%           0.103688
75%           0.147511
max           1.000000
Name: tblob_score, dtype: float64
In [25]:
# Density-normalised histogram with a KDE overlay of the signed polarity.
# `sns.histplot` replaces the deprecated `sns.distplot`; `stat='density'` keeps
# the y-axis on the density scale that the axis label promises.
plt.figure(figsize=(7, 5))  # Set the size of the plot
sns.histplot(tblob_score, bins=30, kde=True, stat='density', edgecolor='black')

# Customize the plot
plt.title('Distribution of tblob Polarity Scores')
plt.xlabel('Polarity Score')  # signed scores are plotted here, not absolute values
plt.ylabel('Density')

plt.show()
No description has been provided for this image

2. Sentiment Over Time¶

Year¶

In [26]:
# Average polarity per calendar year, as a tidy Year / Average_Sentiment frame.
yearly_sentiment = (
    news.groupby('year')['tblob_score']
    .mean()
    .rename_axis('Year')
    .reset_index(name='Average_Sentiment')
)
In [27]:
yearly_sentiment.head()
Out[27]:
Year Average_Sentiment
0 2020 0.077761
1 2021 0.090981
2 2022 0.114463
3 2023 0.110122
In [28]:
# Plain white background, no grid.
sns.set(style="white")

# Draw the per-year means on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_sentiment, x='Year', y='Average_Sentiment', marker='o', ax=ax)

# Title, axis labels, and one tick per year so no year is skipped.
ax.set_title('Yearly Average Sentiment Trend', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(yearly_sentiment['Year'])

plt.show()
No description has been provided for this image
In [29]:
# Plain white background, no grid.
sns.set(style="white")

# Raw article rows are passed in, so seaborn aggregates the repeated `year`
# values itself before drawing the line.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news, x='year', y='tblob_score', marker='o', ax=ax)

# Title and axis labels.
ax.set_title('Yearly Average Sentiment Over Time', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

plt.show()
No description has been provided for this image

Month¶

In [30]:
# Mean polarity for every (year, month) pair, flattened to
# Year / Month / Average_Sentiment columns.
monthly_sentiment = (
    news.groupby(['year', 'month'])['tblob_score']
    .mean()
    .rename_axis(['Year', 'Month'])
    .reset_index(name='Average_Sentiment')
)
In [31]:
monthly_sentiment.head()
Out[31]:
Year Month Average_Sentiment
0 2020 1 0.085299
1 2020 2 0.073580
2 2020 3 0.060021
3 2020 4 0.061634
4 2020 5 0.078995
In [32]:
# Custom color palette
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

# Create a larger figure size to prevent overlapping
plt.figure(figsize=(20, 10))

# One line per year. seaborn expects a list palette to hold one color per hue
# level (a mismatch warns or errors depending on the release), so the 10-color
# list is trimmed to the number of years actually present.
sns.lineplot(
    x='Month',
    y='Average_Sentiment',
    hue='Year',
    data=monthly_sentiment,
    marker='o',
    palette=custom_colors[:monthly_sentiment['Year'].nunique()],
)

# Add titles and labels
plt.title('Monthly Average Sentiment Over Time', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])  # Month labels from 1 to 12

# Move the legend outside of the plot
plt.legend(title='Year', bbox_to_anchor=(1.02, 1.02), loc='upper left')

# Show the plot
plt.show()
No description has been provided for this image
In [33]:
# Build a 'YYYY-MM' key: years are padded to 4 digits and months to 2.
# (Padding the year to width 2 had no effect on 4-digit years.)
monthly_sentiment['Year_Month'] = monthly_sentiment['Year'].astype(str).str.zfill(4) + '-' + monthly_sentiment['Month'].astype(str).str.zfill(2)
In [34]:
monthly_sentiment.head()
Out[34]:
Year Month Average_Sentiment Year_Month
0 2020 1 0.085299 2020-01
1 2020 2 0.073580 2020-02
2 2020 3 0.060021 2020-03
3 2020 4 0.061634 2020-04
4 2020 5 0.078995 2020-05
In [35]:
# Convert 'Year_Month' to datetime (first day of each month) for plotting.
# The format is given explicitly so parsing does not depend on inference.
monthly_sentiment['Year_Month'] = pd.to_datetime(monthly_sentiment['Year_Month'], format='%Y-%m')
In [36]:
# Plain white background, no grid.
sns.set(style="white")

# Single continuous timeline of the monthly means.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Year_Month', y='Average_Sentiment', marker='o', ax=ax)

# Title and axis labels.
ax.set_title('Monthly Average Sentiment Over Time', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

plt.show()
No description has been provided for this image
In [37]:
# Plain white background, no grid.
sns.set(style="white")

# Raw article rows are passed in, so seaborn aggregates all rows that share a
# month number (across every year) before drawing the line.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news, x='month', y='tblob_score', marker='o', ax=ax)

# Title and axis labels.
ax.set_title('Monthly Average Sentiment', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

plt.show()
No description has been provided for this image

Day¶

In [38]:
# Mean polarity for every (year, month, day) triple, flattened to
# Year / Month / Day / Average_Sentiment columns.
daily_sentiment = (
    news.groupby(['year', 'month', 'day'])['tblob_score']
    .mean()
    .rename_axis(['Year', 'Month', 'Day'])
    .reset_index(name='Average_Sentiment')
)
In [39]:
daily_sentiment.head()
Out[39]:
Year Month Day Average_Sentiment
0 2020 1 1 0.087578
1 2020 1 2 0.099098
2 2020 1 3 0.111516
3 2020 1 4 0.118009
4 2020 1 5 0.117570
In [40]:
daily_sentiment['Month_Day'] = daily_sentiment['Month'].astype(str).str.zfill(2) + '-' + daily_sentiment['Day'].astype(str).str.zfill(2)
In [41]:
daily_sentiment.head()
Out[41]:
Year Month Day Average_Sentiment Month_Day
0 2020 1 1 0.087578 01-01
1 2020 1 2 0.099098 01-02
2 2020 1 3 0.111516 01-03
3 2020 1 4 0.118009 01-04
4 2020 1 5 0.117570 01-05
In [42]:
# Custom color palette
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

# Set the style to white (no grid)
sns.set(style="white")

# Create a line plot with a larger figure size
plt.figure(figsize=(20, 10))
# One line per year. seaborn expects a list palette to hold one color per hue
# level (a mismatch warns or errors depending on the release), so the 10-color
# list is trimmed to the number of years actually present.
sns.lineplot(
    data=daily_sentiment,
    x='Month_Day',
    y='Average_Sentiment',
    hue='Year',
    palette=custom_colors[:daily_sentiment['Year'].nunique()],
)

# Customize the plot
plt.title('Daily Average Sentiment Trend By Year', fontsize=16)
plt.xlabel('Month-Day', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)

# Improve x-tick readability: label only every 10th distinct Month-Day value
x_ticks = daily_sentiment['Month_Day'].unique()[::10]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)  # Rotate x-ticks for better readability

# Place the legend outside the plot
plt.legend(title='Year', bbox_to_anchor=(1.01, 1.01), loc='upper left')

# Adjust subplot parameters for better layout
plt.subplots_adjust(right=0.8)

# Show the plot
plt.show()
No description has been provided for this image
In [43]:
# Mean polarity per distinct `date` value, as a Date / Average_Sentiment frame.
daily_sentiment2 = (
    news.groupby('date')['tblob_score']
    .mean()
    .rename_axis('Date')
    .reset_index(name='Average_Sentiment')
)
In [44]:
daily_sentiment2.head()
Out[44]:
Date Average_Sentiment
0 2020-01-01 0.087578
1 2020-01-02 0.099098
2 2020-01-03 0.111516
3 2020-01-04 0.118009
4 2020-01-05 0.117570
In [45]:
# Plain white background, no grid.
sns.set(style="white")

# Single continuous timeline of the daily means.
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=daily_sentiment2, x='Date', y='Average_Sentiment', ax=ax)

# Title and axis labels.
ax.set_title('Daily Average Sentiment Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

# Label only every 30th distinct date, rotated so the labels do not collide.
x_ticks = daily_sentiment2['Date'].unique()[::30]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)

plt.show()
No description has been provided for this image
In [46]:
# Plain white background, no grid.
sns.set(style="white")

# Raw article rows are passed in, so seaborn aggregates all rows that share a
# day-of-month number before drawing the line.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news, x='day', y='tblob_score', marker='o', ax=ax)

# Title and axis labels.
ax.set_title('Daily Average Sentiment', fontsize=16)
ax.set_xlabel('Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

plt.show()
No description has been provided for this image

3.2. Positive Sentiment (Average of Sentiment from Positive)¶

1. Sentiment Distribution¶

In [47]:
news_po = news[news['tblob_sent'] == 'positive']
In [48]:
po_tblob_score = news_po['tblob_score']
In [49]:
po_tblob_score.describe()
Out[49]:
count    1.837190e+05
mean     1.168289e-01
std      6.668608e-02
min      2.220446e-18
25%      7.167363e-02
50%      1.094301e-01
75%      1.515162e-01
max      1.000000e+00
Name: tblob_score, dtype: float64
In [50]:
# Density-normalised histogram with a KDE overlay of the positive polarities.
# `sns.histplot` replaces the deprecated `sns.distplot`; `stat='density'` keeps
# the y-axis on the density scale that the axis label promises.
plt.figure(figsize=(7, 5))  # Set the size of the plot
sns.histplot(po_tblob_score, bins=30, kde=True, stat='density', edgecolor='black')

# Customize the plot
plt.title('Distribution of tblob Polarity Scores from Positive Sentiment')
plt.xlabel('Polarity Score')  # TextBlob reports polarity, not a compound score
plt.ylabel('Density')

plt.show()
No description has been provided for this image

2. Sentiment Over Time¶

Year¶

In [51]:
# Average polarity per calendar year over the positive-labelled articles only.
yearly_sentiment = (
    news_po.groupby('year')['tblob_score']
    .mean()
    .rename_axis('Year')
    .reset_index(name='Average_Sentiment')
)
In [52]:
yearly_sentiment.head()
Out[52]:
Year Average_Sentiment
0 2020 0.107211
1 2021 0.111604
2 2022 0.124247
3 2023 0.117383
In [53]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=yearly_sentiment, x='Year', y='Average_Sentiment', marker='o')

# Customize the plot
plt.title('Yearly Average Sentiment Trend from Positive Sentiment', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
plt.xticks(yearly_sentiment['Year'])  # Ensure all years are shown as x-ticks

# Show the plot
plt.show()
No description has been provided for this image
In [54]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(12, 6))
sns.lineplot(data=news_po, x='year', y='tblob_score', marker='o')

# Customize the plot
plt.title('Yearly Average Sentiment Over Time from Positive Sentiment', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# plt.xticks(rotation=90)  # Rotate x-ticks for better readability

# Show the plot
plt.show()
No description has been provided for this image

Month¶

In [55]:
# Group by year and month, and calculate the average sentiment score for each month
monthly_sentiment = news_po.groupby(['year', 'month'])['tblob_score'].mean().reset_index()
monthly_sentiment.columns = ['Year', 'Month', 'Average_Sentiment']
In [56]:
monthly_sentiment.head()
Out[56]:
Year Month Average_Sentiment
0 2020 1 0.106327
1 2020 2 0.101878
2 2020 3 0.101978
3 2020 4 0.103658
4 2020 5 0.107567
In [57]:
# Custom color palette
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

# Create a larger figure size to prevent overlapping
plt.figure(figsize=(20, 10))

# One line per year. seaborn expects a list palette to hold one color per hue
# level (a mismatch warns or errors depending on the release), so the 10-color
# list is trimmed to the number of years actually present.
sns.lineplot(
    x='Month',
    y='Average_Sentiment',
    hue='Year',
    data=monthly_sentiment,
    marker='o',
    palette=custom_colors[:monthly_sentiment['Year'].nunique()],
)

# Add titles and labels
plt.title('Monthly Average Sentiment by Year from Positive Sentiment', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])  # Month labels from 1 to 12

# Move the legend outside of the plot
plt.legend(title='Year', bbox_to_anchor=(1.02, 1.02), loc='upper left')

# Show the plot
plt.show()
No description has been provided for this image
In [58]:
# Build a 'YYYY-MM' key: years are padded to 4 digits and months to 2.
# (Padding the year to width 2 had no effect on 4-digit years.)
monthly_sentiment['Year_Month'] = monthly_sentiment['Year'].astype(str).str.zfill(4) + '-' + monthly_sentiment['Month'].astype(str).str.zfill(2)
In [59]:
monthly_sentiment.head()
Out[59]:
Year Month Average_Sentiment Year_Month
0 2020 1 0.106327 2020-01
1 2020 2 0.101878 2020-02
2 2020 3 0.101978 2020-03
3 2020 4 0.103658 2020-04
4 2020 5 0.107567 2020-05
In [60]:
# Convert 'Year_Month' to datetime (first day of each month) for plotting.
# The format is given explicitly so parsing does not depend on inference.
monthly_sentiment['Year_Month'] = pd.to_datetime(monthly_sentiment['Year_Month'], format='%Y-%m')
In [61]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Year_Month', y='Average_Sentiment', marker='o')

# Customize the plot
plt.title('Monthly Average Sentiment Over Time from Positive Sentiment', fontsize=16)
plt.xlabel('Year-Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# plt.xticks(rotation=90)  # Rotate x-ticks for better readability

# Show the plot
plt.show()
No description has been provided for this image
In [62]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(12, 6))
sns.lineplot(data=news_po, x='month', y='tblob_score', marker='o')

# Customize the plot
plt.title('Monthly Average Sentiment from Positive Sentiment', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# plt.xticks(rotation=90)  # Rotate x-ticks for better readability

# Show the plot
plt.show()
No description has been provided for this image

Day¶

In [63]:
daily_sentiment = news_po.groupby(['year', 'month', 'day'])['tblob_score'].mean().reset_index()
daily_sentiment.columns = ['Year', 'Month', 'Day', 'Average_Sentiment']
In [64]:
daily_sentiment.head()
Out[64]:
Year Month Day Average_Sentiment
0 2020 1 1 0.094981
1 2020 1 2 0.125209
2 2020 1 3 0.128022
3 2020 1 4 0.128935
4 2020 1 5 0.127711
In [65]:
daily_sentiment['Month_Day'] = daily_sentiment['Month'].astype(str).str.zfill(2) + '-' + daily_sentiment['Day'].astype(str).str.zfill(2)
In [66]:
daily_sentiment.head()
Out[66]:
Year Month Day Average_Sentiment Month_Day
0 2020 1 1 0.094981 01-01
1 2020 1 2 0.125209 01-02
2 2020 1 3 0.128022 01-03
3 2020 1 4 0.128935 01-04
4 2020 1 5 0.127711 01-05
In [67]:
# Custom color palette
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

# Set the style to white (no grid)
sns.set(style="white")

# Create a line plot with a larger figure size
plt.figure(figsize=(20, 10))
# One line per year. seaborn expects a list palette to hold one color per hue
# level (a mismatch warns or errors depending on the release), so the 10-color
# list is trimmed to the number of years actually present.
sns.lineplot(
    data=daily_sentiment,
    x='Month_Day',
    y='Average_Sentiment',
    hue='Year',
    palette=custom_colors[:daily_sentiment['Year'].nunique()],
)

# Customize the plot
plt.title('Daily Average Sentiment Trend by Year from Positive Sentiment', fontsize=16)
plt.xlabel('Month-Day', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)

# Improve x-tick readability: label only every 10th distinct Month-Day value
x_ticks = daily_sentiment['Month_Day'].unique()[::10]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)  # Rotate x-ticks for better readability

# Place the legend outside the plot
plt.legend(title='Year', bbox_to_anchor=(1.01, 1.01), loc='upper left')

# Adjust subplot parameters for better layout
plt.subplots_adjust(right=0.8)

# Show the plot
plt.show()
No description has been provided for this image
In [68]:
daily_sentiment2 = news_po.groupby('date')['tblob_score'].mean().reset_index()
daily_sentiment2.columns = ['Date', 'Average_Sentiment']
In [69]:
daily_sentiment2.head()
Out[69]:
Date Average_Sentiment
0 2020-01-01 0.094981
1 2020-01-02 0.125209
2 2020-01-03 0.128022
3 2020-01-04 0.128935
4 2020-01-05 0.127711
In [70]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(20, 10))
sns.lineplot(data=daily_sentiment2, x='Date', y='Average_Sentiment')

# Customize the plot
plt.title('Daily Average Sentiment Over Time from Positive Sentiment', fontsize=16)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
x_ticks = daily_sentiment2['Date'].unique()[::30]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)  # Rotate x-ticks for better readability

# Show the plot
plt.show()
No description has been provided for this image
In [71]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(12, 6))
sns.lineplot(data=news_po, x='day', y='tblob_score', marker='o')

# Customize the plot
plt.title('Daily Average Sentiment from Positive Sentiment', fontsize=16)
plt.xlabel('Day', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# plt.xticks(rotation=90)  # Rotate x-ticks for better readability

# Show the plot
plt.show()
No description has been provided for this image

3.3. Negative Sentiment (Average of Sentiment from Negative)¶

1. Sentiment Distribution¶

In [72]:
news_ne = news[news['tblob_sent'] == 'negative']
In [73]:
ne_tblob_score = news_ne['tblob_score']
In [74]:
ne_tblob_score.describe()
Out[74]:
count    1.371000e+04
mean    -5.734629e-02
std      6.800852e-02
min     -1.000000e+00
25%     -7.720708e-02
50%     -3.826780e-02
75%     -1.558900e-02
max     -2.523234e-18
Name: tblob_score, dtype: float64
In [75]:
# Density-normalised histogram with a KDE overlay of the negative polarities.
# `sns.histplot` replaces the deprecated `sns.distplot`; `stat='density'` keeps
# the y-axis on the density scale that the axis label promises.
plt.figure(figsize=(7, 5))  # Set the size of the plot
sns.histplot(ne_tblob_score, bins=30, kde=True, stat='density', edgecolor='black')

# Customize the plot
plt.title('Distribution of tblob Polarity Scores from Negative Sentiment')
plt.xlabel('Polarity Score')  # TextBlob reports polarity, not a compound score
plt.ylabel('Density')

plt.show()
No description has been provided for this image

2. Sentiment Over Time¶

Year¶

In [76]:
# Average polarity per calendar year over the negative-labelled articles only.
yearly_sentiment = (
    news_ne.groupby('year')['tblob_score']
    .mean()
    .rename_axis('Year')
    .reset_index(name='Average_Sentiment')
)
In [77]:
yearly_sentiment.head()
Out[77]:
Year Average_Sentiment
0 2020 -0.068832
1 2021 -0.069226
2 2022 -0.055172
3 2023 -0.041761
In [78]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=yearly_sentiment, x='Year', y='Average_Sentiment', marker='o')

# Customize the plot
plt.title('Yearly Average Sentiment Trend from Negative Sentiment', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
plt.xticks(yearly_sentiment['Year'])  # Ensure all years are shown as x-ticks

# Show the plot
plt.show()
No description has been provided for this image
In [79]:
# Set the style
sns.set(style="white")

# Create a line plot
plt.figure(figsize=(12, 6))
sns.lineplot(data=news_ne, x='year', y='tblob_score', marker='o')

# Customize the plot
plt.title('Yearly Average Sentiment Over Time from Negative Sentiment', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# plt.xticks(rotation=90)  # Rotate x-ticks for better readability

# Show the plot
plt.show()
No description has been provided for this image

Month¶

In [80]:
# Group by year and month, and calculate the average sentiment score for each month
monthly_sentiment = news_ne.groupby(['year', 'month'])['tblob_score'].mean().reset_index()
monthly_sentiment.columns = ['Year', 'Month', 'Average_Sentiment']
In [81]:
monthly_sentiment.head()
Out[81]:
Year Month Average_Sentiment
0 2020 1 -0.062309
1 2020 2 -0.065608
2 2020 3 -0.068339
3 2020 4 -0.065841
4 2020 5 -0.068862
In [82]:
# Custom color palette
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

# Create a larger figure size to prevent overlapping
plt.figure(figsize=(20, 10))

# One line per year. seaborn expects a list palette to hold one color per hue
# level (a mismatch warns or errors depending on the release), so the 10-color
# list is trimmed to the number of years actually present.
sns.lineplot(
    x='Month',
    y='Average_Sentiment',
    hue='Year',
    data=monthly_sentiment,
    marker='o',
    palette=custom_colors[:monthly_sentiment['Year'].nunique()],
)

# Add titles and labels (title size matches the other monthly-by-year plots)
plt.title('Monthly Average Sentiment by Year from Negative Sentiment', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])  # Month labels from 1 to 12

# Move the legend outside of the plot
plt.legend(title='Year', bbox_to_anchor=(1.02, 1.02), loc='upper left')

# Show the plot
plt.show()
No description has been provided for this image
In [83]:
# Build a 'YYYY-MM' key: years are padded to 4 digits and months to 2.
# (Padding the year to width 2 had no effect on 4-digit years.)
monthly_sentiment['Year_Month'] = monthly_sentiment['Year'].astype(str).str.zfill(4) + '-' + monthly_sentiment['Month'].astype(str).str.zfill(2)
In [84]:
monthly_sentiment.head()
Out[84]:
Year Month Average_Sentiment Year_Month
0 2020 1 -0.062309 2020-01
1 2020 2 -0.065608 2020-02
2 2020 3 -0.068339 2020-03
3 2020 4 -0.065841 2020-04
4 2020 5 -0.068862 2020-05
In [85]:
# Convert 'Year_Month' to datetime (first day of each month) for plotting.
# The format is given explicitly so parsing does not depend on inference.
monthly_sentiment['Year_Month'] = pd.to_datetime(monthly_sentiment['Year_Month'], format='%Y-%m')
In [86]:
# Plain white background, no grid
sns.set(style="white")

# Single chronological line with one marker per calendar month
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=monthly_sentiment, x='Year_Month', y='Average_Sentiment', marker='o')

# Title and axis labels are set on the axes seaborn just drew on
ax.set_title('Monthly Average Sentiment Over Time from Negative Sentiment', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
In [87]:
# Set the style
sns.set(style="white")

# Average sentiment per calendar month (all years pooled) for the negative
# subset. This cell previously plotted news_po, which contradicted both the
# title and the surrounding negative-sentiment section; the matching
# day-of-month chart below uses news_ne as well.
plt.figure(figsize=(12, 6))
sns.lineplot(data=news_ne, x='month', y='tblob_score', marker='o')

# Customize the plot
plt.title('Monthly Average Sentiment from Negative Sentiment', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)

# Show the plot
plt.show()
No description has been provided for this image

Day¶

In [88]:
# Mean TextBlob score of the news_ne articles for every (year, month, day).
daily_sentiment = (
    news_ne
    .groupby(['year', 'month', 'day'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={
        'year': 'Year',
        'month': 'Month',
        'day': 'Day',
        'tblob_score': 'Average_Sentiment',
    })
)
In [89]:
# Preview the first rows of the per-day average sentiment table.
daily_sentiment.head()
Out[89]:
Year Month Day Average_Sentiment
0 2020 1 1 -0.082687
1 2020 1 2 -0.039064
2 2020 1 3 -0.081845
3 2020 1 4 -0.040412
4 2020 1 5 -0.041304
In [90]:
# Year-independent 'MM-DD' key so the same calendar day lines up across years.
month_part = daily_sentiment['Month'].astype(str).str.zfill(2)
day_part = daily_sentiment['Day'].astype(str).str.zfill(2)
daily_sentiment['Month_Day'] = month_part + '-' + day_part
In [91]:
# Check that the new Month_Day column was added as expected.
daily_sentiment.head()
Out[91]:
Year Month Day Average_Sentiment Month_Day
0 2020 1 1 -0.082687 01-01
1 2020 1 2 -0.039064 01-02
2 2020 1 3 -0.081845 01-03
3 2020 1 4 -0.040412 01-04
4 2020 1 5 -0.041304 01-05
In [92]:
# Base colour cycle; one colour is consumed per year drawn.
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

# A list palette must match the number of hue levels (older seaborn raises on
# a mismatch, newer versions warn), so keep only as many colours as years.
year_palette = custom_colors[:daily_sentiment['Year'].nunique()]

# Set the style to white (no grid)
sns.set(style="white")

# One line per year over the 'MM-DD' calendar axis
plt.figure(figsize=(20, 10))
sns.lineplot(data=daily_sentiment, x='Month_Day', y='Average_Sentiment', hue='Year', palette=year_palette)

# Customize the plot
plt.title('Daily Average Sentiment Trend By Year from Negative Sentiment', fontsize=16)
plt.xlabel('Month-Day', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)

# Improve x-tick readability: keep every 10th distinct 'MM-DD' label
# (in order of first appearance) instead of one tick per day.
x_ticks = daily_sentiment['Month_Day'].unique()[::10]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)  # Rotate x-ticks for better readability

# Place the legend outside the plot
plt.legend(title='Year', bbox_to_anchor=(1.01, 1.01), loc='upper left')

# Leave room on the right for the outside legend
plt.subplots_adjust(right=0.8)

# Show the plot
plt.show()
No description has been provided for this image
In [93]:
# Mean TextBlob score of the news_ne articles per value of the 'date' column.
daily_sentiment2 = (
    news_ne
    .groupby('date')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'date': 'Date', 'tblob_score': 'Average_Sentiment'})
)
In [94]:
# Preview the first rows of the per-date average sentiment table.
daily_sentiment2.head()
Out[94]:
Date Average_Sentiment
0 2020-01-01 -0.082687
1 2020-01-02 -0.039064
2 2020-01-03 -0.081845
3 2020-01-04 -0.040412
4 2020-01-05 -0.041304
In [95]:
# Plain white background, no grid
sns.set(style="white")

# Continuous daily series across the whole date range
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=daily_sentiment2, x='Date', y='Average_Sentiment')

# Title and axis labels
ax.set_title('Daily Average Sentiment Over Time from Negative Sentiment', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

# Label every 30th distinct date and turn the labels vertical
x_ticks = daily_sentiment2['Date'].unique()[::30]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)

# Render the figure
plt.show()
No description has been provided for this image
In [96]:
# Plain white background, no grid
sns.set(style="white")

# Sentiment by day-of-month for news_ne; seaborn aggregates the raw rows
# that share the same 'day' value before drawing the line
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=news_ne, x='day', y='tblob_score', marker='o')

# Title and axis labels
ax.set_title('Daily Average Sentiment from Negative Sentiment', fontsize=16)
ax.set_xlabel('Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image

3-(B). Sentiment over time: Article Numbers¶

In [97]:
# Number of non-null sentiment scores per year, across all sentiment labels.
news['tblob_score'].groupby(news['year']).count()
Out[97]:
year
2020     22836
2021     28962
2022     36775
2023    109491
Name: tblob_score, dtype: int64
In [98]:
# Rows per year in news_po; size() counts rows regardless of which column is selected.
grouped_data_po = news_po.groupby('year').size().reset_index(name='count')
In [99]:
# Preview the yearly row counts computed from news_po.
grouped_data_po.head()
Out[99]:
year count
0 2020 18884
1 2021 25638
2 2022 34741
3 2023 104456
In [100]:
# Plain white background, no grid
sns.set(style="white")

# One bar per year with the article count from grouped_data_po
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=grouped_data_po, x='year', y='count')

# Title and axis labels
ax.set_title('News Article Count(Yearly) from Positive Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
In [101]:
# Rows per year in news_ne; size() counts rows regardless of which column is selected.
grouped_data_ne = news_ne.groupby('year').size().reset_index(name='count')
In [102]:
# Preview the yearly row counts computed from news_ne.
grouped_data_ne.head()
Out[102]:
year count
0 2020 3615
1 2021 3269
2 2022 1941
3 2023 4885
In [103]:
# Plain white background, no grid
sns.set(style="white")

# One bar per year with the article count from grouped_data_ne
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=grouped_data_ne, x='year', y='count')

# Title and axis labels
ax.set_title('News Article Count(Yearly) from Negative Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
In [104]:
# Year x sentiment-label table of article counts; combinations with no
# articles are shown as 0 rather than NaN.
pivot_data = news.pivot_table(
    index='year',
    columns='tblob_sent',
    aggfunc='size',
    fill_value=0,
)
In [105]:
# Preview the year-by-sentiment count table.
pivot_data.head()
Out[105]:
tblob_sent negative neutral positive
year
2020 3615 337 18884
2021 3269 55 25638
2022 1941 93 34741
2023 4885 150 104456
In [106]:
sns.set(style="white")

# Wide-form input: the index (year) becomes x and each sentiment column a line
plt.figure(figsize=(10, 5))
sns.lineplot(data=pivot_data, markers=True, dashes=False)

# Customize the plot
plt.title('Yearly News Article Count by Sentiment', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Years are integers on a continuous axis; pin the ticks to the years present
# so the auto-locator cannot insert fractional ticks such as 2020.5.
plt.xticks(pivot_data.index)

# Place the legend outside of the plot to the right
plt.legend(title='Sentiment', loc='upper left', bbox_to_anchor=(1.01, 1.02))

# Adjust subplot parameters to fit the legend
plt.subplots_adjust(right=0.75)

# Show the plot
plt.show()
No description has been provided for this image
In [107]:
# 'YYYY-MM' key stored on the main frame (month zero-padded so keys sort chronologically)
year_str = news['year'].astype(str)
month_str = news['month'].astype(str).str.zfill(2)
news['year_month'] = year_str + '-' + month_str

# Article count for every (year_month, sentiment label) combination present
grouped_data = (
    news
    .groupby(['year_month', 'tblob_sent'])
    .size()
    .reset_index(name='count')
)
In [108]:
# Plain white background, no grid
sns.set(style="white")

# Labels to chart, one figure each, in this order
sentiments = ['positive', 'negative', 'neutral']

# Keep only every skim_factor-th x tick label so the month axis stays legible
skim_factor = 5

for sentiment in sentiments:
    # Monthly counts for this label only
    data_filtered = grouped_data.loc[grouped_data['tblob_sent'] == sentiment]

    plt.figure(figsize=(10, 5))
    barplot = sns.barplot(data=data_filtered, x='year_month', y='count')

    # Title and axis labels
    barplot.set_title(f'Monthly Article Count ({sentiment.capitalize()} Sentiment)', fontsize=16)
    barplot.set_xlabel('Year-Month', fontsize=14)
    barplot.set_ylabel('Article Count', fontsize=14)

    # Blank out the labels that are skipped and rotate the rest vertically
    xtick_labels = barplot.get_xticklabels()
    thinned_labels = [
        label if position % skim_factor == 0 else ''
        for position, label in enumerate(xtick_labels)
    ]
    barplot.set_xticklabels(thinned_labels, rotation=90)

    # Render this sentiment's figure before starting the next one
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [109]:
# Reshape long -> wide: one row per year_month, one column per sentiment label.
# Months in which a label never occurs become 0 instead of NaN.
pivot_data = (
    grouped_data
    .pivot(index='year_month', columns='tblob_sent', values='count')
    .fillna(0)
)
In [110]:
# One colour per sentiment column of pivot_data
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # Add more colors as needed

# Create the axes explicitly and hand them to pandas. DataFrame.plot opens its
# own figure when no `ax` is given, which previously left the 20x10 figure
# created by plt.figure() empty ("<Figure size 2000x1000 with 0 Axes>") and
# drew the chart at the default size instead.
fig, ax = plt.subplots(figsize=(20, 10))
pivot_data.plot(kind='bar', stacked=True, color=custom_colors, ax=ax)

# Customize the plot
ax.set_title('Monthly Total Article Count with Sentiment Portions', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Total Article Count', fontsize=14)

# Rotate x-ticks and keep only every skim_factor-th label
skim_factor = 5  # Adjust this value as needed
xtick_labels = ax.get_xticklabels()
ax.set_xticklabels(
    [label if i % skim_factor == 0 else '' for i, label in enumerate(xtick_labels)],
    rotation=90,
)

# Place the legend outside of the plot
ax.legend(title='Sentiment', bbox_to_anchor=(1.01, 1.02), loc='upper left')

# Show the plot
plt.show()
<Figure size 2000x1000 with 0 Axes>
No description has been provided for this image

4. Word Count¶

4.1. Original Data¶

In [111]:
# Plain white background, no grid
sns.set(style="white")

# Box plot of article length per sentiment label (outliers included)
plt.figure(figsize=(18, 8))
ax = sns.boxplot(data=news, x='tblob_sent', y='text_word_count')

# Title and axis labels
ax.set_title('Word Count Distribution by Sentiment', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Word Count', fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
In [112]:
# Violin plot of article length per sentiment label
plt.figure(figsize=(12, 8))
ax = sns.violinplot(data=news, x='tblob_sent', y='text_word_count')

# Title and axis labels
ax.set_title('Word Count Distribution by Sentiment', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Word Count', fontsize=14)

# Render the figure
plt.show()
No description has been provided for this image
In [113]:
plt.figure(figsize=(10, 6))

sentiments = news['tblob_sent'].unique()  # Labels in order of first appearance

# Bind each colour to its label explicitly. Pairing a fixed colour list with
# unique() made the colours depend on row order (e.g. 'negative' could be drawn
# in green) and raised IndexError as soon as a fourth label appeared.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}
colors = [sentiment_colors.get(sentiment, 'blue') for sentiment in sentiments]  # 'blue' for unexpected labels

for sentiment, color in zip(sentiments, colors):
    data = news[news['tblob_sent'] == sentiment]  # Rows for this sentiment label
    # stat='density' normalises each histogram on its own, so labels with very
    # different article counts remain comparable in shape.
    sns.histplot(data=data, x='text_word_count', label=sentiment, color=color, bins=30, stat='density', element='step')

plt.xlabel('Text Word Count')
plt.ylabel('Density')
plt.title('Distribution of Text Word Count by Sentiment')
plt.legend(title='Sentiment')
plt.show()
No description has been provided for this image

4.2. Data without Outliers¶

In [114]:
# Same box plot as before, but outlier markers are hidden (showfliers=False)
# so the boxes and whiskers are readable; no rows are removed from the data.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(data=news, x='tblob_sent', y='text_word_count', showfliers=False)
ax.set_title('Word Count Distribution by Sentiment (Without Outliers)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count')
plt.show()
No description has been provided for this image
In [115]:
# Violin plot with cut=0: the density is not extended beyond the observed
# minimum and maximum word counts.
plt.figure(figsize=(12, 8))
ax = sns.violinplot(data=news, x='tblob_sent', y='text_word_count', cut=0)
ax.set_title('Word Count Distribution by Sentiment (Violin Plot)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count')
plt.show()
No description has been provided for this image
In [116]:
# Summary statistics of article length for rows labelled 'positive'.
news.loc[news['tblob_sent'] == 'positive', 'text_word_count'].describe()
Out[116]:
count    183719.000000
mean        818.312766
std         619.243402
min           6.000000
25%         490.000000
50%         673.000000
75%         995.000000
max       29325.000000
Name: text_word_count, dtype: float64
In [117]:
# Summary statistics of article length for rows labelled 'negative'.
news.loc[news['tblob_sent'] == 'negative', 'text_word_count'].describe()
Out[117]:
count    13710.000000
mean       680.988330
std        409.065458
min          4.000000
25%        413.000000
50%        642.000000
75%        873.000000
max      10083.000000
Name: text_word_count, dtype: float64
In [118]:
# Summary statistics of article length for rows labelled 'neutral'.
news.loc[news['tblob_sent'] == 'neutral', 'text_word_count'].describe()
Out[118]:
count     635.000000
mean       46.858268
std       149.197745
min         3.000000
25%        10.000000
50%        12.000000
75%        15.000000
max      1320.000000
Name: text_word_count, dtype: float64
In [119]:
def calculate_outlier_thresholds(series, factor=1.5):
    """Return the IQR-based (lower, upper) outlier fences of a numeric series.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; anything exposing a pandas-style ``quantile`` works.
    factor : float, default 1.5
        Multiple of the IQR added beyond the quartiles. The default reproduces
        the previous fixed behaviour; a larger value (e.g. 3.0) flags only more
        extreme points.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``. The lower bound may be negative even
        when every value in the series is non-negative.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    margin = factor * (q3 - q1)
    return q1 - margin, q3 + margin
In [120]:
# IQR-based (lower, upper) word-count fences, computed separately per label
positive_thresholds, negative_thresholds, neutral_thresholds = (
    calculate_outlier_thresholds(news.loc[news['tblob_sent'] == label, 'text_word_count'])
    for label in ('positive', 'negative', 'neutral')
)
In [121]:
# Show the (lower, upper) fences in the order positive, negative, neutral
for bounds in (positive_thresholds, negative_thresholds, neutral_thresholds):
    print(bounds)
(-267.5, 1752.5)
(-277.0, 1563.0)
(2.5, 22.5)
In [122]:
# Set the style
sns.set(style="white")

# Define the figure size
plt.figure(figsize=(10, 6))

# Drop the long tail: keep articles with at most this many words. The cutoff
# is a round number just above the positive/negative upper IQR fences
# printed earlier (1752.5 and 1563.0).
max_word_count = 2000
filtered_news = news[news['text_word_count'] <= max_word_count]

# Get unique sentiment categories (order of first appearance)
sentiments = filtered_news['tblob_sent'].unique()

# Bind each colour to its label explicitly. Zipping a fixed colour list with
# unique() made the colours depend on row order and silently dropped any label
# beyond the third.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}
colors = [sentiment_colors.get(sentiment, 'blue') for sentiment in sentiments]  # 'blue' for unexpected labels

# Plot overlapping histograms for each sentiment category
for sentiment, color in zip(sentiments, colors):
    # Word counts of the articles carrying this label
    data = filtered_news[filtered_news['tblob_sent'] == sentiment]['text_word_count']
    sns.histplot(data, label=sentiment, color=color, element='step', stat='count', common_norm=False, binwidth=50)

# Customize the plot
plt.xlabel('Text Word Count')
plt.ylabel('Article Count')
plt.title('Distribution of Text Word Count by Sentiment (Word Count ≤ 2000)')

# Place the legend outside the plot
plt.legend(title='Sentiment', bbox_to_anchor=(1.01, 1.02), loc='upper left')

# Show the plot
plt.tight_layout()  # Adjust the layout
plt.show()
No description has been provided for this image

5. Word Cloud¶

In [123]:
from wordcloud import WordCloud
In [124]:
# Helper that renders a word cloud for a block of text
def generate_wordcloud(text, title):
    """Build a word cloud from `text` and display it with `title` above it.

    The cloud is generated on an 800x400 white canvas and shown on a 10x5 inch
    figure with the axes hidden. Nothing is returned.
    """
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud.generate(text)

    plt.figure(figsize=(10, 5), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title(title, fontsize=20)
    plt.show()
In [125]:
# Join the lemmatised text of every article labelled 'positive' into one string
positive_docs = news.loc[news['tblob_sent'] == 'positive', 'text_lemm']
sentiment_text = " ".join(positive_docs)

# Build the cloud, limited to the 100 most frequent words
wordcloud_sentiment = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate(sentiment_text)

# Draw the cloud as an image without axes
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud_sentiment, interpolation='bilinear')
plt.title('Positive Sentiment Word Cloud', fontsize=20)
plt.axis('off')
plt.show()
No description has been provided for this image
In [126]:
# Join the lemmatised text of every article labelled 'negative' into one string
negative_docs = news.loc[news['tblob_sent'] == 'negative', 'text_lemm']
sentiment_text = " ".join(negative_docs)

# Build the cloud, limited to the 100 most frequent words
wordcloud_sentiment = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate(sentiment_text)

# Draw the cloud as an image without axes
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud_sentiment, interpolation='bilinear')
plt.title('Negative Sentiment Word Cloud', fontsize=20)
plt.axis('off')
plt.show()
No description has been provided for this image
In [127]:
# Join the lemmatised text of every article labelled 'neutral' into one string
neutral_docs = news.loc[news['tblob_sent'] == 'neutral', 'text_lemm']
sentiment_text = " ".join(neutral_docs)

# Build the cloud, limited to the 100 most frequent words
wordcloud_sentiment = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate(sentiment_text)

# Draw the cloud as an image without axes
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud_sentiment, interpolation='bilinear')
plt.title('Neutral Sentiment Word Cloud', fontsize=20)
plt.axis('off')
plt.show()
No description has been provided for this image